********************************************************************************
********************************************************************************
* Implements health regression discontinuity analysis
********************************************************************************
********************************************************************************

clear all

set maxvar 10000
set more off
capture log close
capture restore

global tmp "d:\tmp\PollutionHealth"
capture !mkdir $tmp

global pollutants "PM10 pm25"
global weather    "temp prcp"

global lg_mortality "lg_res_cvd lg_lungca lg_res_cvd_lungca"
global mortality    "m_res m_cvd m_lungca m_res_cvd m_res_cvd_lungca"



******************************************************************
*** Select 3-year period and spatial unit; prepares file paths ***
*** Mortality data are proprietary and can be accessed through ***
*** the Chinese Center for Disease Control and Prevention (as  ***
*** noted in the article's data availability statement. These  ***
*** data were merged with pollution, weather, and population.  ***
******************************************************************
*Select ONE
local analysisUnit "county"  
*local analysisUnit "town"  

*Select ONE
local yearStart 2013	
*local yearStart 2016

*Select ONE
local yearEnd   2015
*local yearEnd   2018

*Select ONE
local hetero "above65"
*local hetero "female"
*local hetero "male"

local pathFile1 "$tmp\HealthEstimate_"

global aggregateSample `pathFile1'`analysisUnit'_`yearStart'_`yearEnd'
global aggregateSample_hetero `pathFile1'`analysisUnit'_`yearStart'_`yearEnd'_`hetero'
global aggregateSample_urban `pathFile1'`analysisUnit'_`yearStart'_`yearEnd'_urban



*******************************************************************
*** Replicate Tables 4 and A24-A34 mortality RD estimates	  	***
*******************************************************************

/*** For Tables A35-A38, use county-level life expectancy generated as
	 explained in appendix pages 53-54. Run the following code replacing 
	 the dependent variable with life expectancy. ***/

use $aggregateSample, clear

drop if r_icd < 0.004 | r_icd > 0.02			// drop outliers


***Qin-Huai boundary***														

sum year dist_qinhuai_IZ $mortality $weather if heat_qinhuai_IZ == 1			
sum year dist_qinhuai_IZ $mortality $weather if heat_qinhuai_IZ == 0			

*Running variable
gen     runvar =  dist_qinhuai_IZ if heat_qinhuai_IZ==1
replace runvar = -dist_qinhuai_IZ if heat_qinhuai_IZ==0
*Quadratic
gen     runvar_2 = runvar * runvar
replace runvar_2 = -runvar_2      if heat_qinhuai_IZ==0
*Cubic
gen runvar_3 = runvar * runvar * runvar
*Interaction
gen north_dist1 = heat_qinhuai_IZ * runvar
gen north_dist2 = heat_qinhuai_IZ * runvar_2 
gen north_dist3 = heat_qinhuai_IZ * runvar_3

*Column 1
foreach y of global lg_mortality {	
	reg `y' heat_qinhuai_IZ runvar north_dist1 [aw = pop_total], robust
	}

*Column 2
foreach y of global lg_mortality {
	reg `y' heat_qinhuai_IZ runvar north_dist1 $weather [aw = pop_total], robust
	}

*Column 3
foreach y of global lg_mortality {
	reg `y' heat_qinhuai_IZ runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 [aw = pop_total], robust
	}

*Column 4
foreach y of global lg_mortality {	
	reg `y' heat_qinhuai_IZ runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total], robust
	}

*Column 5
foreach y of global lg_mortality {	
	reg `y' heat_qinhuai_IZ runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total] if dist_qinhuai_IZ<=500, robust
	}
sum $mortality if dist_qinhuai_IZ <= 500
	
*Column 6
foreach y of global lg_mortality {
	rdrobust `y' runvar, c(0) p(1) kernel(tri) weights(pop_total)
	}

*Column 7
foreach y of global lg_mortality {
	rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) weights(pop_total)
	}
	
	// for heterogeneous analysis (age and gender) in Tables A32 and A33
	use $aggregateSample_hetero, clear
	foreach y of global lg_mortality {
		rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) weights(pop_`hetero')
		}
	// for heterogeneous analysis (urban vs. rural) in Table A34
	use $aggregateSample_urban, clear
	foreach y of global lg_mortality {
		rdrobust `y' runvar if urban == 1, c(0) p(1) kernel(tri) covs($weather) weights(pop_total)
		rdrobust `y' runvar if urban == 0, c(0) p(1) kernel(tri) covs($weather) weights(pop_total)
		}

*Column 8
foreach y of global lg_mortality {
	rdrobust `y' runvar, c(0) p(1) kernel(tri) h(100) covs($weather) weights(pop_total)
	}

*Column 9
foreach y of global lg_mortality {
	rdrobust `y' runvar if dist_qinhuai_IZ<=500, c(0) p(1) kernel(tri) covs($weather) weights(pop_total)
	}
	
	
***Alternative boundary***														

sum year dist_real $mortality $weather if heat_real==1							
sum year dist_real $mortality $weather if heat_real==0

capture drop runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3
*Running variable
gen     runvar =  dist_real  if heat_real==1
replace runvar = -dist_real  if heat_real==0
*Quadratic
gen     runvar_2 = runvar * runvar
replace runvar_2 = -runvar_2 if heat_real==0
*Cubic
gen runvar_3 = runvar * runvar * runvar
*Interaction
gen north_dist1 = heat_real * runvar
gen north_dist2 = heat_real * runvar_2 
gen north_dist3 = heat_real * runvar_3	

*Column 1				
foreach y of global lg_mortality {	
	reg `y' heat_real runvar north_dist1 [aw = pop_total], robust 
	}

*Column 2
foreach y of global lg_mortality {
	reg `y' heat_real runvar north_dist1 $weather [aw = pop_total], robust
	}

*Column 3
foreach y of global lg_mortality {
	reg `y' heat_real runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 [aw = pop_total], robust
	}

*Column 4
foreach y of global lg_mortality {	
	reg `y' heat_real runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total], robust
	}

*Column 5
foreach y of global lg_mortality {	
	reg `y' heat_real runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total] if dist_real<=500, robust
	}
sum $mortality if dist_real<=500
	
*Column 6
foreach y of global lg_mortality {
	rdrobust `y' runvar, c(0) p(1) kernel(tri) [aw = pop_total]
	}

*Column 7
foreach y of global lg_mortality {
	rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) [aw = pop_total]
	}
	
	// for heterogeneous analysis (age and gender) in Tables A32 and A33
	use $aggregateSample_hetero, clear
	foreach y of global lg_mortality {
		rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) weights(pop_`hetero')
		}
	// for heterogeneous analysis (urban vs. rural) in Table A34
	use $aggregateSample_urban, clear
	foreach y of global lg_mortality {
		rdrobust `y' runvar if urban == 1, c(0) p(1) kernel(tri) covs($weather) weights(pop_total)
		rdrobust `y' runvar if urban == 0, c(0) p(1) kernel(tri) covs($weather) weights(pop_total)
		}

*Column 8
foreach y of global lg_mortality {
	rdrobust `y' runvar, c(0) p(1) kernel(tri) h(100) covs($weather) [aw = pop_total]
	}

*Column 9
foreach y of global lg_mortality {
	rdrobust `y' runvar if dist_real<=500, c(0) p(1) kernel(tri) covs($weather) [aw = pop_total]
	}
	
	
	
*******************************************************************
*** Replicate Tables A39 to A46	pollution-mortality 2SLS 		***
*******************************************************************
	
/*** A39 (county) and A43 (town), cubic-in-distance controls ***/	

*Select ONE
local analysisUnit "county"  
*local analysisUnit "town"  

*Select ONE
local yearStart 2013	
*local yearStart 2016

*Select ONE
local yearEnd   2015
*local yearEnd   2018

local pathFile1 "$tmp\HealthEstimate_"

global aggregateSample `pathFile1'`analysisUnit'_`yearStart'_`yearEnd'_2SLS

global weather      "temp prcp"
global lg_mortality "lg_res_cvd lg_lungca lg_res_cvd_lungca"
global mortality    "m_res m_cvd m_lungca m_res_cvd m_res_cvd_lungca"

drop if r_icd < 0.004 | r_icd > 0.02			// drop outliers

use $aggregateSample, clear


***Qin-Huai boundary***														

*Running variable
gen     runvar =  dist_qinhuai_IZ if heat_qinhuai_IZ==1
replace runvar = -dist_qinhuai_IZ if heat_qinhuai_IZ==0
*Quadratic
gen     runvar_2 = runvar * runvar
replace runvar_2 = -runvar_2      if heat_qinhuai_IZ==0
*Cubic
gen runvar_3 = runvar * runvar * runvar
*Interaction
gen north_dist1 = heat_qinhuai_IZ * runvar
gen north_dist2 = heat_qinhuai_IZ * runvar_2 
gen north_dist3 = heat_qinhuai_IZ * runvar_3

* Tables A39 and A43 3rd poly with weather control and population weights
	
foreach y of global mortality {	
	ivreg2 `y' (pm25 = heat_qinhuai_IZ) runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total], robust
	ivreg2 `y' (PM10 = heat_qinhuai_IZ) runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total], robust
	}

sum m_res_cvd m_lungca m_res_cvd_lungca 

* Tables A40 and A44 local linear with bandwidth chosen as 300 km

gen t = 300 - dist_qinhuai_IZ
gen w = 0 if t < 0
replace w = t if t >= 0
gen pop_total_new = w * pop_total		// generate triangle weights

foreach y of global mortality {	
	ivreg2 `y' (pm25 = heat_qinhuai_IZ) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= 300 & runvar >= -300, robust
	ivreg2 `y' (PM10 = heat_qinhuai_IZ) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= 300 & runvar >= -300, robust
	}

sum m_res_cvd m_lungca m_res_cvd_lungca if runvar <= 300 & runvar >= -300

* Tables A41 and A45 local linear with the optimal bandwidth for the corresponding mortality RD

local b = xxx		// xxx is the optimal bandwidth for the corresponding mortality RD reported in Tables A24 to A31
cap drop t w pop_total_new
gen t = `b' - dist_qinhuai_IZ
gen w = 0 if t < 0
replace w = t if t >= 0
gen pop_total_new = w * pop_total		// generate triangle weights

foreach y of global mortality {	
	ivreg2 `y' (pm25 = heat_qinhuai_IZ) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= `b' & runvar >= -`b', robust
	ivreg2 `y' (PM10 = heat_qinhuai_IZ) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= `b' & runvar >= -`b', robust
	}

sum m_res_cvd m_lungca m_res_cvd_lungca if runvar <= `b' & runvar >= -`b'

* Tables A42 and A46 fuzzy RD

foreach y of global mortality {	
	rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) weights(pop_total) fuzzy(pm25)
	sum `y' if runvar <= `e(h_r)' & runvar >= -`e(h_l)'

	rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) weights(pop_total) fuzzy(PM10)
	sum `y' if runvar <= `e(h_r)' & runvar >= -`e(h_l)'
	}
	

***Alternative boundary***														

capture drop runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 t w pop_total_new
*Running variable
gen     runvar =  dist_real  if heat_real==1
replace runvar = -dist_real  if heat_real==0
*Quadratic
gen     runvar_2 = runvar * runvar
replace runvar_2 = -runvar_2 if heat_real==0
*Cubic
gen runvar_3 = runvar * runvar * runvar
*Interaction
gen north_dist1 = heat_real * runvar
gen north_dist2 = heat_real * runvar_2 
gen north_dist3 = heat_real * runvar_3	

* Tables A39 and A43 3rd poly with weather control and population weights
	
foreach y of global mortality {	
	ivreg2 `y' (pm25 = heat_real) runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total], robust
	
	ivreg2 `y' (PM10 = heat_real) runvar runvar_2 runvar_3 north_dist1 north_dist2 north_dist3 $weather [aw = pop_total], robust
	}

sum m_res_cvd m_lungca m_res_cvd_lungca 

* Tables A40 and A44 local linear with bandwidth chosen as 300 km

gen t = 300 - dist_real
gen w = 0 if t < 0
replace w = t if t >= 0
gen pop_total_new = w * pop_total		// generate triangle weights

foreach y of global mortality {	
	ivreg2 `y' (pm25 = heat_real) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= 300 & runvar >= -300, robust
	
	ivreg2 `y' (PM10 = heat_real) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= 300 & runvar >= -300, robust
	}

sum m_res_cvd m_lungca m_res_cvd_lungca if runvar <= 300 & runvar >= -300

* Tables A41 and A45 local linear with the optimal bandwidth for the corresponding mortality RD 

local b = xxx		// xxx is the optimal bandwidth for the corresponding mortality RD reported in Tables A24 to A31
cap drop t w pop_total_new
gen t = `b' - dist_real
gen w = 0 if t < 0
replace w = t if t >= 0
gen pop_total_new = w * pop_total		// generate triangle weights

foreach y of global mortality {	
	ivreg2 `y' (pm25 = heat_real) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= `b' & runvar >= -`b', robust
	
	ivreg2 `y' (PM10 = heat_real) runvar north_dist1 $weather [aw = pop_total_new] if runvar <= `b' & runvar >= -`b', robust
	}

sum m_res_cvd m_lungca m_res_cvd_lungca if runvar <= `b' & runvar >= -`b'

* Tables A42 and A46 fuzzy RD

foreach y of global mortality {	
	rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) weights(pop_total) fuzzy(pm25)
	sum `y' if runvar <= `e(h_r)' & runvar >= -`e(h_l)'
	
	rdrobust `y' runvar, c(0) p(1) kernel(tri) covs($weather) weights(pop_total) fuzzy(PM10)
	sum `y' if runvar <= `e(h_r)' & runvar >= -`e(h_l)'
	}
	
	

*******************************************************************
*** Replicate Fig. 7     					         		   	***
*******************************************************************

/*** Fig. 7 ***/
* Select ONE
use "$tmp\HealthEstimate_county_2013_2015", clear
*use "$tmp\HealthEstimate_county_2016_2018", clear

drop if r_icd < 0.004 | r_icd > 0.02			// drop outliers
gen m_res_cvd = (r_res + r_cvd) * 100000
gen m_lungca = r_lungca * 100000

* generate distance bins, every 20 km
gen bin = .
replace bin = 1 if dist_real >= 2 & dist_real < 20 & heat_real == 1
replace bin = 2 if dist_real >= 20 & dist_real < 40 & heat_real == 1
replace bin = 3 if dist_real >= 40 & dist_real < 60 & heat_real == 1
replace bin = 4 if dist_real >= 60 & dist_real < 80 & heat_real == 1
replace bin = 5 if dist_real >= 80 & dist_real < 100 & heat_real == 1
replace bin = 6 if dist_real >= 100 & dist_real < 120 & heat_real == 1
replace bin = 7 if dist_real >= 120 & dist_real < 140 & heat_real == 1
replace bin = 8 if dist_real >= 140 & dist_real < 160 & heat_real == 1
replace bin = 9 if dist_real >= 160 & dist_real < 180 & heat_real == 1
replace bin = 10 if dist_real >= 180 & dist_real < 200 & heat_real == 1
replace bin = 11 if dist_real >= 200 & dist_real < 220 & heat_real == 1
replace bin = 12 if dist_real >= 220 & dist_real < 240 & heat_real == 1
replace bin = 13 if dist_real >= 240 & dist_real < 260 & heat_real == 1
replace bin = 14 if dist_real >= 260 & dist_real < 280 & heat_real == 1
replace bin = 15 if dist_real >= 280 & dist_real < 300 & heat_real == 1
replace bin = -1 if dist_real > 0 & dist_real < 20  & heat_real == 0
replace bin = -2 if dist_real >= 20 & dist_real < 40 & heat_real == 0
replace bin = -3 if dist_real >= 40 & dist_real < 60 & heat_real == 0
replace bin = -4 if dist_real >= 60 & dist_real < 80 & heat_real == 0
replace bin = -5 if dist_real >= 80 & dist_real < 100 & heat_real == 0
replace bin = -6 if dist_real >= 100 & dist_real < 120 & heat_real == 0
replace bin = -7 if dist_real >= 120 & dist_real < 140 & heat_real == 0
replace bin = -8 if dist_real >= 140 & dist_real < 160 & heat_real == 0
replace bin = -9 if dist_real >= 160 & dist_real < 180 & heat_real == 0
replace bin = -10 if dist_real >= 180 & dist_real < 200 & heat_real == 0
replace bin = -11 if dist_real >= 200 & dist_real < 220 & heat_real == 0
replace bin = -12 if dist_real >= 220 & dist_real < 240 & heat_real == 0
replace bin = -13 if dist_real >= 240 & dist_real < 260 & heat_real == 0
replace bin = -14 if dist_real >= 260 & dist_real < 280 & heat_real == 0
replace bin = -15 if dist_real >= 280 & dist_real < 300 & heat_real == 0

* for each distance bin, generate total population
bysort bin: egen totalpop_sum = sum(pop_total)
replace totalpop_sum = totalpop_sum/10^5

tempfile t1
save `t1', replace

* collapse to distance bin
collapse (mean) m_res_cvd m_lungca heat_real, by(bin)
tsset bin
tsfill
drop if bin == 0
drop if bin == .
gen extra = 1

gen dist = .
replace dist = -290 if bin == -15
replace dist = -270 if bin == -14
replace dist = -250 if bin == -13
replace dist = -230 if bin == -12
replace dist = -210 if bin == -11
replace dist = -190 if bin == -10
replace dist = -170 if bin == -9
replace dist = -150 if bin == -8
replace dist = -130 if bin == -7
replace dist = -110 if bin == -6
replace dist = -90 if bin == -5
replace dist = -70 if bin == -4
replace dist = -50 if bin == -3
replace dist = -30 if bin == -2
replace dist = -10 if bin == -1
replace dist = 10 if bin == 1
replace dist = 30 if bin == 2
replace dist = 50 if bin == 3
replace dist = 70 if bin == 4
replace dist = 90 if bin == 5
replace dist = 110 if bin == 6
replace dist = 130 if bin == 7
replace dist = 150 if bin == 8
replace dist = 170 if bin == 9
replace dist = 190 if bin == 10
replace dist = 210 if bin == 11
replace dist = 230 if bin == 12
replace dist = 250 if bin == 13
replace dist = 270 if bin == 14
replace dist = 290 if bin == 15

* append back
append using `t1'
replace extra = 0 if extra == .
tab extra
count if totalpop_sum == .
bysort bin: egen mtotalpop = mean(totalpop_sum)

* bandwidth select
global weather "temp prcp"
gen runvar = dist_real
replace runvar = -dist_real if heat_real == 0

rdbwselect m_res_cvd runvar, c(0) p(1) kernel(tri) bwselect(mserd) covs($weather) weights(pop_total)
local h_res = `e(h_mserd)'

rdbwselect m_lungca runvar, c(0) p(1) kernel(tri) bwselect(mserd) covs($weather) weights(pop_total)
local h_lungca = `e(h_mserd)'

* keep samples within 300 km
keep if dist_real <= 300 | extra == 1

* graph: res/cvd
lpoly m_res_cvd dist_real if heat_real == 1, degree(1) kernel(tri) bwidth(`h_res') gen(rsc_bin_north rsc_lpoly_north)
lpoly m_res_cvd dist_real if heat_real == 0, degree(1) kernel(tri) bwidth(`h_res') gen(rsc_bin_south rsc_lpoly_south)
replace rsc_bin_south = -rsc_bin_south

sum rsc_bin_north
replace rsc_bin_north = 0 if rsc_bin_north < 8
sum rsc_bin_south
replace rsc_bin_south = 0 if rsc_bin_south > -7 & rsc_bin_south <0

# delimit ;
	scatter m_res_cvd dist [aw = mtotalpop] if extra == 1 & heat_real == 0, sort mcolor(blue) msize(medium) mfcolor(white) xlabel(-300(50)300) ylabel(, nogrid) 
	|| scatter m_res_cvd dist [aw = mtotalpop] if extra == 1 & heat_real == 1, sort mcolor(green) msize(medium) mfcolor(white) msymbol(circle)
	|| line rsc_lpoly_north rsc_bin_north, lwidth(1) sort lcolor(gray)
	|| line rsc_lpoly_south rsc_bin_south
	, xline(0, lcolor(black)) lwidth(1) sort lcolor(gray) 
	xtitle("Shortest distance to the boundary (km, positive if on north side)", size(medsmall))
	ytitle("Annual resp./CVD deaths per 100,000")
	title("County-level resp./CVD against distance to the alternative boundary", size(medsmall) color(black)) scheme(s1manual)
	legend(rows(2) size(medium) order(1 "resp./CVD in South" 2 "resp./CVD in North" 3 "Local Linear Regression") symxsize(medium))
	;
# delimit cr

* graph: lungca
lpoly m_lungca dist_real if heat_real == 1, degree(1) kernel(tri) bwidth(`h_lungca') gen(lungca_bin_north lungca_lpoly_north)
lpoly m_lungca dist_real if heat_real == 0, degree(1) kernel(tri) bwidth(`h_lungca') gen(lungca_bin_south lungca_lpoly_south)
replace lungca_bin_south = -lungca_bin_south

sum lungca_bin_north
replace lungca_bin_north = 0 if lungca_bin_north < 8
sum lungca_bin_south
replace lungca_bin_south = 0 if lungca_bin_south > -7 & lungca_bin_south <0

# delimit ;
	scatter m_lungca dist [aw = mtotalpop] if extra == 1 & heat_real == 0, sort mcolor(blue) msize(medium) mfcolor(white) xlabel(-300(50)300) ylabel(, nogrid) 
	|| scatter m_lungca dist [aw = mtotalpop] if extra == 1 & heat_real == 1, sort mcolor(green) msize(medium) mfcolor(white) msymbol(circle)
	|| line lungca_lpoly_north lungca_bin_north, lwidth(1) sort lcolor(gray)
	|| line lungca_lpoly_south lungca_bin_south
	, xline(0, lcolor(black)) lwidth(1) sort lcolor(gray) 
	xtitle("Shortest distance to the boundary (km, positive if on north side)", size(medsmall))
	ytitle("Annual lung cancer deaths per 100,000")
	title("County-level lung cancer against distance to the alternative boundary", size(medsmall) color(black)) scheme(s1manual)
	legend(rows(2) size(medium) order(1 "Lung cancer in South" 2 "Lung cancer in North" 3 "Local Linear Regression") symxsize(medium))
	;
# delimit cr